This R Markdown file assesses the output of CheckV, DeepVirFinder, Kaiju, VIBRANT, VirSorter, and VirSorter2 on multiple training sets of microbial DNA, primarily from NCBI. The training sets were created from fungal, viral, bacterial, archaeal, protist, and plasmid DNA sequences.

Please reach out to James Riddell or Bridget Hegarty regarding any issues, or open an issue on GitHub.

library(ggplot2)
library(plyr)
library(reshape2)
library(viridis)
library(tidyr)
library(dplyr)
library(readr)
library(data.table)
library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var
library(here)

Inputs

  1. For this script’s inputs, the user must merge the output files of each viral identification tool into one tab-separated file per tool. If needed, replace the filenames assigned to the variables in the ‘inputs’ chunk with the paths to your own files.

    General filenames for each tool: CheckV: quality_summary.tsv; VIBRANT: VIBRANT_genome_quality_${assembly}.tsv; VirSorter: VIRSorter_global-phage-signal.csv; VirSorter2: final-viral-score.tsv; DeepVirFinder: ${assembly}.fasta_gt2500bp_dvfpred.txt; Kaiju: ${assembly}.kaiju.names.out

  2. Make sure all fasta headers (assembly, contig) are consistent within and across tools. Chunks for each tool do contain some lines for cleaning up these features, but due to their variability it will be the user’s responsibility to make sure they match across tools.

  3. Check each chunk and ensure all columns are accounted for.

  4. This script is designed for contigs > 3000 basepairs. It can be modified to be higher or lower, but going lower will greatly increase the size of the dataframe and memory usage.

# Dataset name, used for file organization and for naming the metrics output
dataset_name <- "testing_set"
# Path to the merged CheckV output (tab-separated)
checkV_path <- "../ToolOutput/testing_sets_checkv_output.tsv"
# Path to the merged VIBRANT output (tab-separated)
vibrant_path <- "../ToolOutput/testing_sets_vibrant_output.tsv"
# Path to the merged DeepVirFinder output (tab-separated)
dvf_path <- "../ToolOutput/testing_sets_deepvirfinder_output.tsv"
# Path to the merged VirSorter output
vs_path <- "../ToolOutput/testing_sets_virsorter_output.tsv"
# Path to the merged VirSorter2 output (tab-separated)
vs2_path <- "../ToolOutput/testing_sets_virsorter2_output.tsv"
# Path to the merged Kaiju output (tab-separated)
kj_path <- "../ToolOutput/merged.nreuk.kaiju.names.out"
# Minimum contig length. Despite the "KB" in the name, this value is compared
# directly against the CheckV contig length, i.e. it is in base pairs.
KB_CUTOFF <- 3000

All Viral Contigs

checkV

# CheckV: read the merged quality summary, keeping only the columns used
# downstream, and give the tool-specific columns a `checkv_` prefix.
checkV <- rename(
  fread(
    checkV_path,
    sep = "\t",
    header = TRUE,
    select = c(
      "Index", "contig_id", "provirus", "completeness", "contamination",
      "viral_genes", "host_genes", "gene_count", "contig_length",
      "checkv_quality"
    )
  ),
  contig = contig_id,
  checkv_provirus = provirus,
  checkv_completeness = completeness,
  checkv_contamination = contamination,
  checkv_viral_genes = viral_genes,
  checkv_host_genes = host_genes,
  checkv_total_genes = gene_count,
  checkv_length = contig_length
)
checkV$method <- "checkv"
# Contig ids are split on "--" into the sequence type and the contig name.
checkV <- separate(
  checkV,
  col = contig,
  into = c("seqtype", "contig"),
  sep = "--"
)
# Replace the first "." and then the first "|" in each contig name with "_".
checkV$contig <- sub("\\|", "_", sub("\\.", "_", checkV$contig))
# Keep the first row for every (contig, Index) pair.
checkV <- checkV[!duplicated(paste(checkV$contig, checkV$Index)), ]

VIBRANT

# VIBRANT: read the merged genome-quality table.
vb_c <- rename(
  fread(
    vibrant_path,
    header = TRUE,
    sep = "\t",
    select = c("Index", "scaffold", "type", "Quality")
  ),
  contig = scaffold,
  vibrant_quality = Quality
)
vb_c$method <- "vibrant"
# Scaffolds whose name contains "_fragment_" are flagged as prophage.
vb_c$vibrant_prophage <- "No"
vb_c$vibrant_prophage[grepl("_fragment_", vb_c$contig)] <- "Yes"
# Split "<seqtype>--<contig>" into its two parts.
vb_c <- separate(
  vb_c,
  col = contig,
  into = c("seqtype", "contig"),
  sep = "--"
)
# Drop the fragment suffix, then replace the first "." with "_".
vb_c$contig <- sub("\\.", "_", gsub("_fragment_.*", "", vb_c$contig))
# Keep only the text before the first space of the contig name.
vb_c <- separate(vb_c, col = contig, into = "contig", remove = TRUE, sep = " ")
# Keep the first row for every (contig, Index) pair.
vb_c <- vb_c[!duplicated(paste(vb_c$contig, vb_c$Index)), ]

DeepVirFinder

# DeepVirFinder: read the merged prediction table.
dvf_c <- rename(
  fread(
    dvf_path,
    header = TRUE,
    sep = "\t",
    select = c("Index", "name", "score", "pvalue")
  ),
  contig = name
)
# Replace the first "." of the full header with "_" before splitting it on
# "--" into sequence type and contig name.
dvf_c$contig <- sub("\\.", "_", dvf_c$contig)
dvf_c <- separate(
  dvf_c,
  col = contig,
  into = c("seqtype", "contig"),
  sep = "--"
)
# Benjamini-Hochberg adjustment over every row read in (all Index values
# together, and before duplicates are removed).
dvf_c$bh_pvalue <- p.adjust(dvf_c$pvalue, method = "BH")
# Keep the first row for every (contig, Index) pair.
dvf_c <- dvf_c[!duplicated(paste(dvf_c$contig, dvf_c$Index)), ]

VirSorter

# VirSorter: read the merged table; its index column is lower-case, so rename
# it to `Index` to match the other tools.
vs_c <- rename(
  fread(vs_path, select = c("index", "contig", "category")),
  Index = index
)
# Strip the "VIRSorter_" prefix and the "-circular" marker from contig names.
vs_c$contig <- sub("-circular", "", sub("VIRSorter_", "", vs_c$contig))
# Split "<seqtype>--<contig>" into its two parts.
vs_c <- separate(
  vs_c,
  col = `contig`,
  into = c("seqtype", "contig"),
  sep = "--"
)
# Replace the first "." in the contig name with "_".
vs_c$contig <- sub("\\.", "_", vs_c$contig)
# Discard rows left without a contig name, then keep the first row for every
# (contig, Index) pair.
vs_c <- drop_na(vs_c, contig)
vs_c <- vs_c[!duplicated(paste(vs_c$contig, vs_c$Index)), ]

VirSorter2

# VirSorter2: read the merged score table and split `seqname` on "||" into
# the contig header and the VirSorter2 suffix.
vs2_c <- separate(
  fread(
    vs2_path,
    header = TRUE,
    sep = "\t",
    select = c(
      "Index", "seqname", "dsDNAphage", "ssDNA", "max_score",
      "max_score_group", "hallmark", "viral", "cellular"
    )
  ),
  col = seqname,
  into = c("contig", "vs2type"),
  sep = "\\|\\|",
  remove = TRUE
)
# Replace the first "." of the full header with "_" before splitting it on
# "--" into sequence type and contig name.
vs2_c$contig <- sub("\\.", "_", vs2_c$contig)
vs2_c <- separate(
  vs2_c,
  col = contig,
  into = c("seqtype", "contig"),
  sep = "--"
)
# Keep the first row for every (contig, Index) pair.
vs2_c <- vs2_c[!duplicated(paste(vs2_c$contig, vs2_c$Index)), ]

Kaiju

# Kaiju: read the merged tab-separated output; the first row holds the names.
kj_c <- read_tsv(file = kj_path, col_names = TRUE)
Warning: One or more parsing issues, see `problems()` for details
Rows: 96361 Columns: 9
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (5): Classified, Contig, IDs_all, Seq, Name
dbl (3): Index, NCBI_taxon, len

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Replace the first "." of each header with "_", split the header on "--"
# into sequence type and contig name, then split the taxon path in `Name` on
# ";" and keep its first two levels.
kj_c <- kj_c %>%
  mutate(Contig = sub("\\.", "_", Contig)) %>%
  separate(col = Contig, into = c("seqtype", "contig"), sep = "--") %>%
  separate(col = Name, into = c("Kaiju_Viral", "Kingdom"), sep = ";")
Warning: Expected 2 pieces. Additional pieces discarded in 84573 rows [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ...].
# Replace the first remaining "." in the contig name with "_".
# NOTE(review): the header already had one "." replaced before it was split,
# so this is a second replacement that the other tools' chunks do not make;
# confirm that contig names containing two dots still match across tools.
# Then keep the first row for every (contig, Index) pair.
kj_c <- kj_c %>%
  mutate(contig = sub("\\.", "_", contig)) %>%
  filter(!duplicated(paste(contig, Index)))

Quick glimpse of cellular vs viral sequence classification by Kaiju by assembly

# Cross-tabulate the known sequence type (rows) against Kaiju's top-level
# call (columns).
table(kj_c[["seqtype"]], kj_c[["Kaiju_Viral"]])
          
           cellular organisms Viruses
  archaea                9705      41
  bacteria              59945     164
  fungi                   861     104
  plasmid                4972       3
  protist                 557       2
  virus                  4116    4099

Merging

# Full-join every tool's table onto the CheckV table, one after another, on
# the testing-set index, contig name and sequence type. Rows present in only
# some of the tables are kept, with NA in the other tools' columns.
viruses <- Reduce(
  function(merged, tool_table) {
    full_join(
      x = merged,
      y = tool_table,
      by = c("Index", "contig", "seqtype")
    )
  },
  list(checkV, kj_c, dvf_c, vb_c, vs_c, vs2_c)
)

Remove contigs not greater than the basepair length cutoff (3000)

# Keep only contigs whose CheckV length exceeds the cutoff. `filter()` also
# drops rows where the comparison is NA, i.e. rows without a CheckV length.
viruses <- filter(viruses, checkv_length > KB_CUTOFF)

Use the next code chunk to check if any contigs are missing from checkv. If yes, then there is a mix up in contig names since checkv should contain all contig names even if they are identified as non-viral.

# Rows that carry no CheckV record after the merge. The previous version
# tested `checkv_uniq_contig`, a column that is never created in this file,
# so the test had nothing to evaluate and `v_missing` was always empty.
# `checkv_quality` is read from the CheckV table and is NA only for rows that
# did not come from CheckV.
# NOTE(review): the length filter applied just before this chunk already
# drops rows with a missing `checkv_length`; to audit name mismatches, run
# this check before that filter.
v_missing <- viruses[is.na(viruses$checkv_quality), ]

calculate percent viral/host/unknown

# Share of each contig's CheckV genes that are host, viral, or neither,
# expressed as a percentage of the total gene count.
viruses$percent_host <- with(
  viruses,
  checkv_host_genes / checkv_total_genes * 100
)
viruses$percent_viral <- with(
  viruses,
  checkv_viral_genes / checkv_total_genes * 100
)
viruses$percent_unknown <- with(
  viruses,
  100 - (checkv_host_genes + checkv_viral_genes) / checkv_total_genes * 100
)

get rid of NAs for downstream processing

# Replace NAs with neutral placeholders so that the comparisons made during
# scoring do not produce NA.

# CheckV-derived percentages: missing -> 0
# (`percent_host` is not filled here.)
viruses$percent_viral[is.na(viruses$percent_viral)] <- 0
viruses$percent_unknown[is.na(viruses$percent_unknown)] <- 0
# VIBRANT: missing quality label -> 0
viruses$vibrant_quality[is.na(viruses$vibrant_quality)] <- 0
# DeepVirFinder: missing score and adjusted p-value -> 0
# NOTE(review): an adjusted p-value of 0 reads as "most significant"; confirm
# this placeholder is intended if `bh_pvalue` is ever used for filtering.
viruses$score[is.na(viruses$score)] <- 0
viruses$bh_pvalue[is.na(viruses$bh_pvalue)] <- 0
# VirSorter2: missing viral and hallmark values -> 0
viruses$viral[is.na(viruses$viral)] <- 0
viruses$hallmark[is.na(viruses$hallmark)] <- 0
# VirSorter: missing category -> 0 (the scoring rules only act on 1 to 6)
viruses$category[is.na(viruses$category)] <- 0
# Kaiju: missing classification -> "unknown"
viruses$Kaiju_Viral[is.na(viruses$Kaiju_Viral)] <- "unknown"
viruses$Kingdom[is.na(viruses$Kingdom)] <- "unknown"

This section defines a viralness score “keep_score” based on the tool classifications. A final keep_score of 1 or higher indicates we will keep that sequence and call it viral.

VIBRANT Quality == “High Quality Draft”: +1 Quality == “Medium Quality Draft”: +1 Quality == “Low Quality Draft” & provirus == TRUE: +0.5

Virsorter2 Viral >= 50: +0.5 Viral >= 95: +0.5

Virsorter category == 1,2,4,5: +1 category == 3,6: +0.5

DeepVirFinder (applied only to contigs shorter than 20,000 bp): Score >= 0.7: +0.5 Score >= 0.9: +0.5

Kaiju: Kaiju_viral = “cellular organisms”: -1 Kaiju_viral = “Viruses”: +1

CheckV If %unknown >= 75: +0.5 If %viral >= 50: +0.5 Hallmark > 2: +1 viral_genes == 0 and host_genes >= 1: keep_score = 0 If 3*viral_genes <= host_genes and the contig is not a CheckV provirus: keep_score = 0 If length > 50,000 and hallmark == 0: keep_score = 0

This script produces visualizations of these combined viral scorings and includes ecological metrics like alpha diversity.

You can decide which combination of tools is appropriate for your data, and you only need to use those tools.

#' Score every sequence for "viralness" from the selected tools' results.
#'
#' Each enabled tool adds to (or, for Kaiju, may subtract from) a per-sequence
#' `keep_score` that starts at 0. The tuning rules run last and can reset the
#' score to 0.
#'
#' Columns are looked up by exact name. Where the merged table in this file
#' uses a `checkv_`/`vibrant_` prefixed name, the un-prefixed name that this
#' function used to read is still accepted and takes precedence when present.
#' Previously the un-prefixed names (`provirus`, `contig_length`,
#' `viral_genes`, `host_genes`) did not exist in the merged table, so the
#' DeepVirFinder rule, the VIBRANT low-quality rule and three of the tuning
#' rules silently never fired.
#'
#' @param input_seqs Data frame (or data.table/tibble) with one row per
#'   sequence and the merged tool columns.
#' @param include_vibrant Apply the VIBRANT quality rules?
#' @param include_virsorter2 Apply the VirSorter2 `viral` thresholds?
#' @param include_deepvirfinder Apply the DeepVirFinder score thresholds
#'   (contigs shorter than 20000 only)?
#' @param include_tuning Apply the CheckV/hallmark tuning rules?
#' @param include_kaiju Apply the Kaiju classification rule?
#' @param include_virsorter Apply the VirSorter category rules?
#' @return Numeric vector of scores, one per row of `input_seqs`.
getting_viral_set_1 <- function(input_seqs,
                                include_vibrant = FALSE,
                                include_virsorter2 = FALSE,
                                include_deepvirfinder = FALSE,
                                include_tuning = FALSE,
                                include_kaiju = FALSE,
                                include_virsorter = FALSE) {

  n_seqs <- nrow(input_seqs)
  keep_score <- rep(0, n_seqs)

  # Return the first of the candidate columns that exists (exact match, no
  # partial matching). Warn and return NULL when none does, so that a missing
  # column is visible instead of silently disabling a rule.
  get_col <- function(...) {
    candidates <- c(...)
    found <- candidates[candidates %in% names(input_seqs)]
    if (length(found) == 0) {
      warning(
        "getting_viral_set_1: column(s) not found: ",
        paste(candidates, collapse = ", "),
        "; rules that depend on them are skipped",
        call. = FALSE
      )
      return(NULL)
    }
    input_seqs[[found[1]]]
  }

  # Turn a condition into a clean row mask: NA counts as "no match", and a
  # condition built from a missing column (wrong length) matches nothing.
  hits <- function(cond) {
    if (length(cond) != n_seqs) {
      return(rep(FALSE, n_seqs))
    }
    !is.na(cond) & cond
  }

  if (include_vibrant) {
    quality <- get_col("vibrant_quality")
    # NOTE(review): assumed the VIBRANT fragment flag is what "provirus" was
    # meant to be here; confirm it should not be `checkv_provirus`.
    prophage <- get_col("provirus", "vibrant_prophage")
    keep_score <- keep_score + 1 * hits(quality == "high quality draft")
    keep_score <- keep_score + 1 * hits(quality == "medium quality draft")
    keep_score <- keep_score +
      0.5 * hits(quality == "low quality draft" & prophage == "Yes")
  }

  if (include_virsorter2) {
    vs2_viral <- get_col("viral")
    # Two cumulative steps: >= 50 earns 0.5, >= 95 earns a further 0.5.
    keep_score <- keep_score + 0.5 * hits(vs2_viral >= 50)
    keep_score <- keep_score + 0.5 * hits(vs2_viral >= 95)
  }

  if (include_virsorter) {
    category <- get_col("category")
    keep_score <- keep_score + 1 * hits(category %in% c(1, 2, 4, 5))
    keep_score <- keep_score + 0.5 * hits(category %in% c(3, 6))
  }

  if (include_deepvirfinder) {
    dvf_score <- get_col("score")
    contig_length <- get_col("contig_length", "checkv_length")
    is_short <- contig_length < 20000
    # Two cumulative steps: >= 0.7 earns 0.5, >= 0.9 earns a further 0.5.
    keep_score <- keep_score + 0.5 * hits(dvf_score >= 0.7 & is_short)
    keep_score <- keep_score + 0.5 * hits(dvf_score >= 0.9 & is_short)
  }

  if (include_kaiju) {
    kaiju_call <- get_col("Kaiju_Viral")
    keep_score <- keep_score - 1 * hits(kaiju_call == "cellular organisms")
    keep_score <- keep_score + 1 * hits(kaiju_call == "Viruses")
  }

  if (include_tuning) {
    hallmark <- get_col("hallmark")
    percent_unknown <- get_col("percent_unknown")
    percent_viral <- get_col("percent_viral")
    viral_genes <- get_col("viral_genes", "checkv_viral_genes")
    host_genes <- get_col("host_genes", "checkv_host_genes")
    checkv_provirus <- get_col("checkv_provirus")
    contig_length <- get_col("contig_length", "checkv_length")

    keep_score <- keep_score + 1 * hits(hallmark > 2)
    keep_score <- keep_score + 0.5 * hits(percent_unknown >= 75)
    keep_score <- keep_score + 0.5 * hits(percent_viral >= 50)

    # Vetoes: these reset whatever score has accumulated so far.
    keep_score[hits(viral_genes == 0 & host_genes >= 1)] <- 0
    keep_score[hits((viral_genes * 3) <= host_genes &
                      checkv_provirus == "No")] <- 0
    keep_score[hits(contig_length > 50000 & hallmark == 0)] <- 0
  }

  keep_score
}

Assessing performance against the “truth”

note that this is only as accurate as the annotations of the input sequences

this function calculates the precision, recall, and F1 score for each pipeline

#' Compare viral calls against the known sequence types.
#'
#' A sequence is called viral when its `keep_score` is at least 1; it is truly
#' viral when its `seqtype` is "virus".
#'
#' Counts are taken directly from logical masks. The earlier version indexed
#' `table()` output by position, which shifts whenever a category is absent
#' (e.g. no false positives) and made `fungal_FP`/`bacterial_FP` report
#' true-negative counts.
#'
#' @param seqtype Character vector with the known type of every sequence.
#' @param keep_score Numeric vector of viralness scores, same length.
#' @return Named numeric vector: precision, recall, F1, MCC, AUC, fungal_FP,
#'   bacterial_FP, viral_FN. Ratios are NaN when their denominator is 0.
assess_performance <- function(seqtype, keep_score) {

  is_viral <- !is.na(seqtype) & seqtype == "virus"
  called_viral <- !is.na(keep_score) & keep_score >= 1

  false_positive <- !is_viral & called_viral
  false_negative <- is_viral & !called_viral

  # Doubles rather than integers so the products in MCC cannot overflow.
  TP <- as.numeric(sum(is_viral & called_viral))
  FP <- as.numeric(sum(false_positive))
  TN <- as.numeric(sum(!is_viral & !called_viral))
  FN <- as.numeric(sum(false_negative))

  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  F1 <- 2 * precision * recall / (precision + recall)

  MCC <- (TP * TN - FP * FN) /
    sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))

  # Fix the class order and direction (higher score = viral) so a poorly
  # performing combination cannot be auto-flipped into an AUC above 0.5;
  # `quiet` suppresses the per-call "Setting levels/direction" messages.
  truth <- ifelse(is_viral, "viral", "not viral")
  AUC <- round(
    as.numeric(auc(truth, keep_score,
                   levels = c("not viral", "viral"),
                   direction = "<",
                   quiet = TRUE)),
    4
  )

  # By-type error counts.
  fungal_FP <- sum(false_positive & seqtype == "fungi", na.rm = TRUE)
  bacterial_FP <- sum(false_positive & seqtype == "bacteria", na.rm = TRUE)
  viral_FN <- sum(false_negative)

  c(precision = precision, recall = recall, F1 = F1, MCC = MCC, AUC = AUC,
    fungal_FP = fungal_FP, bacterial_FP = bacterial_FP, viral_FN = viral_FN)
}

combination of tools list

# One row per tool combination; the first row of the file holds the names.
combos_list <- read_csv(file = "combinations_list.csv", col_names = TRUE)
Rows: 63 Columns: 6
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (6): CheckV, DVF, Kaiju, VIBRANT, VS, VS2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

this function builds a list of all of the combinations that the user wants to test. In this case, we’re comparing the performance of all unique combinations of the six tools.

#' Score and evaluate every tool combination on one set of sequences.
#'
#' For each row of `combos` the flagged tools are passed to
#' `getting_viral_set_1()` and the resulting scores are evaluated with
#' `assess_performance()`.
#'
#' @param input_seqs Merged per-sequence table; must contain `seqtype` and
#'   the columns the scoring function reads.
#' @param combos Table with one row per combination and the flag columns
#'   `CheckV`, `DVF`, `Kaiju`, `VIBRANT`, `VS`, `VS2`.
#' @return Data frame with one row per combination: the eight performance
#'   metrics (missing values replaced by 0) plus `toolcombo`, the six flags
#'   pasted together in the order CheckV, DVF, Kaiju, VIBRANT, VS, VS2.
build_score_list <- function(input_seqs, combos) {
  metric_names <- c("precision", "recall", "F1", "MCC", "AUC",
                    "fungal_FP", "bacterial_FP", "viral_FN")
  n_combos <- nrow(combos)

  # Preallocate both results; `seq_len()` keeps the loop from running when
  # `combos` has no rows (`1:0` would iterate over 1 and 0).
  metrics <- matrix(0, nrow = n_combos, ncol = length(metric_names),
                    dimnames = list(NULL, metric_names))
  toolcombo <- character(n_combos)

  for (i in seq_len(n_combos)) {
    keep_score <- getting_viral_set_1(input_seqs,
                                      include_vibrant = combos$VIBRANT[i],
                                      include_virsorter = combos$VS[i],
                                      include_virsorter2 = combos$VS2[i],
                                      include_tuning = combos$CheckV[i],
                                      include_kaiju = combos$Kaiju[i],
                                      include_deepvirfinder = combos$DVF[i])

    metrics[i, ] <- assess_performance(input_seqs$seqtype, keep_score)

    toolcombo[i] <- paste(combos$CheckV[i], combos$DVF[i],
                          combos$Kaiju[i], combos$VIBRANT[i],
                          combos$VS[i], combos$VS2[i])
  }

  # Undefined ratios (e.g. precision when nothing was called viral) become 0.
  metrics[is.na(metrics)] <- 0

  output <- as.data.frame(metrics)
  output$toolcombo <- toolcombo
  output
}

Calculate the performance of each pipeline

# Evaluate every tool combination on the rows of testing set 1.
# NOTE(review): `accuracy_scores` is assigned again further down in this
# file, so this result is overwritten before it is used; confirm whether the
# call is still needed.
accuracy_scores <- build_score_list(viruses[viruses$Index==1], combos_list)
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
# Performance of every tool combination on testing set 1, tagged with the
# set index. The zero-filled placeholder data frame that used to be built
# first was overwritten by this assignment straight away, so it is dropped.
accuracy_scores <- cbind(
  testing_set_index = rep(1, nrow(combos_list)),
  build_score_list(viruses[viruses$Index == 1, ], combos_list)
)
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
# Evaluate testing sets 2 to 10 and append them to the set-1 results.
# Each set's table is built independently and everything is bound in one
# `rbind()` call, rather than growing the data frame on every iteration.
accuracy_scores <- do.call(
  rbind,
  c(
    list(accuracy_scores),
    lapply(2:10, function(set_index) {
      cbind(
        testing_set_index = rep(set_index, nrow(combos_list)),
        build_score_list(viruses[viruses$Index == set_index, ], combos_list)
      )
    })
  )
)
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
Setting levels: control = not viral, case = viral
Setting direction: controls < cases
library("stringr")
# numtools = how many "1" characters each toolcombo string contains
# (presumably one on/off flag per tool -- confirm against where toolcombo is built).
accuracy_scores$numtools <- str_count(accuracy_scores$toolcombo, pattern = "1")
# Alternative ordering (by number of tools), kept for reference:
#accuracy_scores <- accuracy_scores[order(accuracy_scores$numtools, decreasing=F),]
# Sort rows by ascending MCC, then lock that row order in as the factor levels
# of toolcombo so discrete plot axes follow the MCC ranking.
accuracy_scores <- accuracy_scores[
  order(accuracy_scores$MCC, decreasing = FALSE),
]
accuracy_scores$toolcombo <- factor(
  accuracy_scores$toolcombo,
  levels = unique(accuracy_scores$toolcombo)
)
# Treat the tool count as categorical so it maps to discrete colours.
accuracy_scores$numtools <- as.factor(accuracy_scores$numtools)

Visualize how precision, recall, F1, MCC, AUC, and the per-taxon false positive / false negative counts change across tool combinations (pipelines).

pal <- ggthemes::tableau_color_pal(palette = "Tableau 10", type = "regular")

# Default x-axis label: the order of the tool flags encoded in `toolcombo`.
tool_combo_axis_label <- "Tool Combination (CV, DVF, KJ, VB, VS, VS2)"

# Theme shared by every accuracy scatter plot in this chunk.
accuracy_plot_theme <- theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y = element_text(size = 14),
    axis.text.x = element_text(size = 14, angle = 90),
    legend.text = element_text(size = 12),
    axis.title = element_text(size = 16)
  )

# Scatter plot of one accuracy metric, coloured by the number of tools used.
#
# mapping: aes() supplying the x and y aesthetics (columns of `data`).
# y_label: y-axis title.
# x_label: x-axis title; defaults to the tool-combination label.
# data:    data frame to plot; defaults to `accuracy_scores`.
# Returns the ggplot object (auto-printed when called at top level).
plot_accuracy_metric <- function(mapping, y_label,
                                 x_label = tool_combo_axis_label,
                                 data = accuracy_scores) {
  ggplot(data, mapping) +
    # colour/fill are the same in every plot, so they are added here once
    aes(color = numtools, fill = numtools) +
    geom_point(alpha = 0.5) +
    accuracy_plot_theme +
    xlab(x_label) +
    ylab(y_label)
}

p2 <- plot_accuracy_metric(aes(x = toolcombo, y = F1), "F1 Score")
p2

plot_accuracy_metric(aes(x = toolcombo, y = precision), "Precision")

plot_accuracy_metric(aes(x = toolcombo, y = recall), "Recall")

# Precision against recall directly, one point per tool combination.
plot_accuracy_metric(aes(x = precision, y = recall), "Recall",
                     x_label = "Precision")

# Absolute gap between precision and recall for each combination.
plot_accuracy_metric(aes(x = toolcombo, y = abs(precision - recall)),
                     "Precision-Recall")

plot_accuracy_metric(aes(x = toolcombo, y = MCC), "MCC")

plot_accuracy_metric(aes(x = toolcombo, y = AUC), "AUC")

plot_accuracy_metric(aes(x = toolcombo, y = fungal_FP),
                     "Fungal False Positives")

plot_accuracy_metric(aes(x = toolcombo, y = bacterial_FP),
                     "Bacterial False Positives")

plot_accuracy_metric(aes(x = toolcombo, y = viral_FN),
                     "Viral False Negatives")

Experimenting

high precision example

# Score every sequence with the tool subset chosen for the "high precision"
# example (VIBRANT, Kaiju and the tuning rules on; DeepVirFinder, VirSorter
# and VirSorter2 off).
viruses$keep_score_high_precision <- getting_viral_set_1(
  viruses,
  include_deepvirfinder = FALSE,
  include_vibrant = TRUE,
  include_virsorter2 = FALSE,
  include_kaiju = TRUE,
  include_tuning = TRUE,
  include_virsorter = FALSE
)

# Label each sequence with its confusion-matrix cell. A sequence counts as
# "called viral" when its keep score is >= 1; anything not matched below
# (including rows with an NA score or seqtype) keeps the default
# "true negative".
viruses$confusion_matrix_high_precision <- "true negative"
# Strict `< 1` so this mask no longer overlaps the `>= 1` true-positive mask
# (the original `<= 1` only worked because the next line overwrote score == 1).
viruses$confusion_matrix_high_precision[
  viruses$seqtype == "virus" & viruses$keep_score_high_precision < 1
] <- "false negative"
viruses$confusion_matrix_high_precision[
  viruses$seqtype == "virus" & viruses$keep_score_high_precision >= 1
] <- "true positive"
viruses$confusion_matrix_high_precision[
  viruses$seqtype != "virus" & viruses$keep_score_high_precision >= 1
] <- "false positive"

visualizing confusion matrix by taxa

# Cross-tabulate confusion-matrix cell x sequence type x Index and reshape the
# 3-way table to long format (one row per combination with its count).
# reshape2::melt is called explicitly: with data.table attached, a bare melt()
# hits data.table's generic, whose redirect to reshape2 is deprecated and is
# announced to become an error.
confusion_by_taxa <- reshape2::melt(
  table(
    viruses$confusion_matrix_high_precision,
    viruses$seqtype,
    viruses$Index
  )
)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype", "Index", "count")
# Palette generator reused by the confusion-matrix plots that follow.
pal <- ggthemes::tableau_color_pal(type = "regular", palette = "Tableau 10")

# Counts per Index value, stacked by confusion-matrix cell, one facet per
# sequence type.
ggplot(
  confusion_by_taxa,
  aes(
    x = count,
    y = as.factor(Index),
    color = confusion_matrix,
    fill = confusion_matrix
  )
) +
  geom_bar(stat = "identity") +
  facet_wrap(~seqtype, scales = "free") +
  coord_flip() +
  # outlines use the full-strength palette, fills the same colours at 50% alpha
  scale_color_manual(
    name = "",
    values = alpha(rev(pal(4)), 1),
    labels = c("false negative", "false positive",
               "true negative", "true positive")
  ) +
  scale_fill_manual(
    name = "",
    values = alpha(rev(pal(4)), 0.5),
    labels = c("false negative", "false positive",
               "true negative", "true positive")
  ) +
  labs(x = "Number of Sequences", y = "") +
  theme_light() +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 12),
    axis.title = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.ticks.y = element_blank(),
    panel.border = element_blank(),
    panel.grid.major.y = element_blank()
  )

 # Layers shared by every confusion-matrix plot in this chunk: theme, the
# four-colour fill/outline scales, per-seqtype facets and flipped coordinates.
# Built fresh on each call so no plot shares scale objects with another.
# Relies on the palette function `pal` defined earlier in the document.
confusion_plot_layers <- function() {
  outcome_labels <- c("false negative", "false positive",
                      "true negative", "true positive")
  list(
    theme_light(),
    theme(
      panel.grid.major.y = element_blank(),
      panel.border = element_blank(),
      axis.ticks.y = element_blank(),
      legend.position = "bottom",
      axis.text.y = element_text(size = 14),
      axis.text.x = element_text(size = 14),
      legend.text = element_text(size = 12),
      axis.title = element_text(size = 16)
    ),
    # fills are the palette at 50% alpha, outlines at full strength
    scale_fill_manual(name = "",
                      values = alpha(rev(pal(4)), 0.5),
                      labels = outcome_labels),
    scale_color_manual(name = "",
                       values = alpha(rev(pal(4)), 1),
                       labels = outcome_labels),
    facet_wrap(~seqtype, scales = "free"),
    coord_flip()
  )
}

# Viral gene counts (CheckV) per confusion-matrix cell.
# The axis title previously read "Number of Viral Sequences", but the mapped
# column is checkv_viral_genes, so it is labelled as gene counts.
ggplot(viruses, aes(x = checkv_viral_genes,
                    y = confusion_matrix_high_precision,
                    fill = confusion_matrix_high_precision,
                    color = confusion_matrix_high_precision)) +
  geom_boxplot(alpha = 0.3) +
  confusion_plot_layers() +
  xlab("Number of Viral Genes") +
  ylab("")


# Percent of genes that are viral, per confusion-matrix cell.
ggplot(viruses, aes(x = percent_viral,
                    y = confusion_matrix_high_precision,
                    fill = confusion_matrix_high_precision,
                    color = confusion_matrix_high_precision)) +
  geom_boxplot(alpha = 0.3) +
  confusion_plot_layers() +
  xlab("Percent Genes Viral") +
  ylab("")


# Hallmark gene counts per confusion-matrix cell.
ggplot(viruses, aes(x = hallmark,
                    y = confusion_matrix_high_precision,
                    fill = confusion_matrix_high_precision,
                    color = confusion_matrix_high_precision)) +
  geom_boxplot(alpha = 0.3) +
  confusion_plot_layers() +
  xlab("Number of Hallmark Genes") +
  ylab("")


# Hallmark genes against viral genes, coloured by confusion-matrix cell.
ggplot(viruses, aes(x = hallmark,
                    y = checkv_viral_genes,
                    fill = confusion_matrix_high_precision,
                    color = confusion_matrix_high_precision)) +
  geom_point(alpha = 0.3) +
  confusion_plot_layers() +
  xlab("Number of Hallmark Genes") +
  ylab("Number of Viral Genes")

# Rows the high-precision rule set got wrong, split by error type.
viruses_false_positive <- viruses[
  viruses$confusion_matrix_high_precision == "false positive",
]
viruses_false_negative <- viruses[
  viruses$confusion_matrix_high_precision == "false negative",
]

# Layers shared by every hallmark-gene scatter plot in this chunk.
#
# y_label:       y-axis title.
# facet_formula: one-sided formula naming the faceting column,
#                e.g. ~seqtype or ~Kaiju_Viral.
# Returns a list of ggplot components that can be added with `+`.
contig_scatter_layers <- function(y_label, facet_formula) {
  list(
    geom_point(alpha = 0.3),
    theme_light(),
    theme(
      panel.grid.major.y = element_blank(),
      panel.border = element_blank(),
      axis.ticks.y = element_blank(),
      legend.position = "bottom",
      axis.text.y = element_text(size = 14),
      axis.text.x = element_text(size = 14),
      legend.text = element_text(size = 12),
      axis.title = element_text(size = 16)
    ),
    xlab("Number of Hallmark Genes"),
    ylab(y_label),
    facet_wrap(facet_formula, scales = "free"),
    coord_flip()
  )
}

# All sequences: hallmark vs viral genes, coloured by contig length.
ggplot(viruses, aes(x = hallmark, y = checkv_viral_genes,
                    fill = checkv_length,
                    color = checkv_length,
                    shape = checkv_provirus)) +
  contig_scatter_layers("Number of Viral Genes", ~seqtype)


# False positives, faceted by sequence type.
ggplot(viruses_false_positive, aes(x = hallmark, y = checkv_length,
                                   fill = checkv_viral_genes,
                                   color = checkv_viral_genes,
                                   shape = checkv_provirus)) +
  contig_scatter_layers("Contig Length", ~seqtype)


# Bacterial false positives. The explicit trailing comma makes this a row
# filter for data.frame and data.table alike; without it a plain data.frame
# would be indexed by column.
ggplot(viruses_false_positive[viruses_false_positive$seqtype == "bacteria", ],
       aes(x = hallmark, y = checkv_length,
           fill = checkv_viral_genes,
           color = checkv_viral_genes,
           shape = checkv_provirus)) +
  contig_scatter_layers("Contig Length", ~Kaiju_Viral)


# Fungal false positives, coloured by keep score.
ggplot(viruses_false_positive[viruses_false_positive$seqtype == "fungi", ],
       aes(x = hallmark, y = checkv_length,
           fill = keep_score_high_precision,
           color = keep_score_high_precision,
           shape = checkv_provirus)) +
  contig_scatter_layers("Contig Length", ~Kaiju_Viral)


# Protist false positives.
ggplot(viruses_false_positive[viruses_false_positive$seqtype == "protist", ],
       aes(x = hallmark, y = checkv_length,
           fill = checkv_viral_genes,
           color = checkv_viral_genes,
           shape = checkv_provirus)) +
  contig_scatter_layers("Contig Length", ~Kaiju_Viral)


# False negatives, coloured by viral gene count.
ggplot(viruses_false_negative, aes(x = hallmark, y = checkv_length,
                                   fill = checkv_viral_genes,
                                   color = checkv_viral_genes,
                                   shape = checkv_provirus)) +
  contig_scatter_layers("Contig Length", ~Kaiju_Viral)


# False negatives, coloured by keep score.
ggplot(viruses_false_negative, aes(x = hallmark, y = checkv_length,
                                   fill = keep_score_high_precision,
                                   color = keep_score_high_precision,
                                   shape = checkv_provirus)) +
  contig_scatter_layers("Contig Length", ~Kaiju_Viral)

# High-precision false positives split by whether they carry any hallmark gene
table(viruses$hallmark[viruses$confusion_matrix_high_precision=="false positive"]>0)

FALSE  TRUE 
  243  1703 
# High-precision false positives split by whether under 50% of their genes are host genes
table(viruses$percent_host[viruses$confusion_matrix_high_precision=="false positive"]<50)

FALSE  TRUE 
  855  1091 

high recall example

# High-recall rule set: VIBRANT, VirSorter, Kaiju and the tuning rules are on;
# DeepVirFinder and VirSorter2 are off.
viruses$keep_score_high_recall <- getting_viral_set_1(
  viruses,
  include_vibrant = TRUE,
  include_virsorter2 = FALSE,
  include_deepvirfinder = FALSE,
  include_tuning = TRUE,
  include_kaiju = TRUE,
  include_virsorter = TRUE
)

# A sequence is called viral when its score is at least 1. Rows where either
# comparison is NA stay "true negative".
labelled_virus <- viruses$seqtype == "virus"
called_viral <- viruses$keep_score_high_recall >= 1
recall_calls <- rep("true negative", nrow(viruses))
recall_calls[which(labelled_virus & !called_viral)] <- "false negative"
recall_calls[which(labelled_virus & called_viral)] <- "true positive"
recall_calls[which(!labelled_virus & called_viral)] <- "false positive"
viruses$confusion_matrix <- recall_calls

visualizing confusion matrix by taxa

# Count sequences per confusion-matrix class, sequence type and testing-set
# index. reshape2::melt is named explicitly: with data.table attached, the bare
# melt() call is redirected with a deprecation warning that is announced to
# become an error.
confusion_by_taxa <- reshape2::melt(
  table(viruses$confusion_matrix, viruses$seqtype, viruses$Index)
)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
# Tableau 10 colour generator; pal(4) supplies one colour per confusion class
pal <- ggthemes::tableau_color_pal(palette = "Tableau 10", type = "regular")

# Stacked bars of sequence counts per testing-set index, one facet per
# sequence type, coloured by confusion-matrix class.
p2 <- ggplot(
  confusion_by_taxa,
  aes(x = count, y = as.factor(Index),
      fill = confusion_matrix, color = confusion_matrix)
) +
  geom_col() +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y = element_text(size = 14),
    axis.text.x = element_text(size = 14),
    legend.text = element_text(size = 12),
    axis.title = element_text(size = 16)
  ) +
  scale_fill_manual(
    name = "",
    values = alpha(rev(pal(4)), 0.5),
    labels = c("false negative", "false positive",
               "true negative", "true positive")
  ) +
  scale_color_manual(
    name = "",
    values = alpha(rev(pal(4)), 1),
    labels = c("false negative", "false positive",
               "true negative", "true positive")
  ) +
  labs(x = "Number of Sequences", y = "") +
  facet_wrap(~seqtype, scales = "free") +
  coord_flip()
p2

Visualizing confusion matrix by number of tools

# Bin the score for plotting: everything at or below 0 is pooled, 0.5 and 1
# keep their own bins, and everything above 1 is pooled.
# `viruses` has no plain keep_score column at this point (reading it returned
# NULL and made every following statement fail), so the high-recall score that
# the confusion_matrix column was derived from is used instead.
# cut() assigns the bins in one step, so unused levels cannot cause a
# levels/labels length mismatch.
viruses$keep_score_visualize <- cut(
  viruses$keep_score_high_recall,
  breaks = c(-Inf, 0, 0.5, 1, Inf),
  labels = c("≤ 0", "0.5", "1", "> 1")
)
levels(viruses$keep_score_visualize)
pal <- ggthemes::tableau_color_pal(palette="Tableau 20", type="regular")
# Sequence counts per testing-set index, stacked by score bin, one facet per
# confusion-matrix class.
p1 <- ggplot(viruses, aes(x=as.factor(Index),
                   fill=keep_score_visualize, color=keep_score_visualize)) +
  geom_bar(stat="count", position="stack") +
  theme_light() +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y=element_text(size=14),
    axis.text.x=element_text(size=14),
    legend.text=element_text(size=12),
    axis.title=element_text(size=16)
  ) +
  scale_color_manual(name = 'Viral Score',
                     values = alpha(c(pal(4)), 1)) +
  scale_fill_manual(name = 'Viral Score',
                     values = alpha(c(pal(4)), 0.5)) +
  xlab("Index") +
  ylab("Sequence Count") +
  facet_wrap(~confusion_matrix, scales = "free")

ROC

library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var
```{r}
Error: attempt to use zero-length variable name
```{r}
Error: attempt to use zero-length variable name

Sensitivity: The probability that the model predicts a positive outcome for an observation when indeed the outcome is positive. Specificity: The probability that the model predicts a negative outcome for an observation when indeed the outcome is negative.

Pull out Fungi

# `on` is not an inner_join() argument; it was ignored and the join fell back
# to every shared column. Those columns are now passed explicitly through `by`
# so the result is unchanged but no longer depends on an ignored argument.
# NOTE(review): if a contig-only join was intended, use by = "contig" instead.
fungi_1_vb_1_vb_c_fungi <- inner_join(
  fungi_1, vb_1_vb_c_fungi,
  by = c("Index", "seqtype", "contig", "type", "vibrant_quality", "vibrant_prophage")
)
---
title: "Viral Sequence Sorting Tools Evaluation"
author: Bridget Hegarty, James Riddell
date: 07-22-2022
output: html_notebook
---
This Rmarkdown file assesses the output of CheckV, DeepVirFinder, Kaiju,
VIBRANT, VirSorter, and VirSorter2 on multiple training sets of microbial DNA, 
primarily from NCBI. The sets were created from fungal, viral, bacterial,
archaeal, protist, and plasmid DNA sequences.

Please reach out to James Riddell (riddell.26@buckeyemail.osu.edu) or
Bridget Hegarty (beh53@case.edu) regarding any issues, or open an issue on github.

```{r setup-library}
library(ggplot2)
library(plyr)
library(reshape2)
library(viridis)
library(tidyr)
library(dplyr)
library(readr)
library(data.table)
library(pROC)
library(here)
```

# Inputs
1) For this script's inputs, the user must merge each viral identification 
tool output file into one tab-separated file. 
Replace the filenames with their paths if needed into the variables in the 
'inputs' chunk.

    General filenames for each tool:
        Checkv: quality_summary.tsv
        VIBRANT: VIBRANT_genome_quality_${assembly}.tsv
        Virsorter: VIRSorter_global-phage-signal.csv
        Virsorter2: final-viral-score.tsv
        DeepVirFinder: ${assembly}.fasta_gt2500bp_dvfpred.txt
        Kaiju: ${assembly}.kaiju.names.out

2) Make sure all fasta headers (assembly, contig) are consistent within and
across tools. Chunks for each tool do contain some lines for cleaning up these 
features, but due to their variability it will be the user's responsibility to 
make sure they match across tools.

3) Check each chunk and ensure all columns are accounted for.

4) This script is designed for contigs > 3000 basepairs. It can be modified
to be higher or lower, but going lower will greatly increase the size of the
dataframe and memory usage.



```{r inputs}
# Dataset name, used for file organization and for naming the metrics output
dataset_name <- "testing_set"
# Merged CheckV output (tab-separated), read in the checkV chunk
checkV_path <- "../ToolOutput/testing_sets_checkv_output.tsv"
# Merged VIBRANT output (tab-separated)
vibrant_path <- "../ToolOutput/testing_sets_vibrant_output.tsv"
# Merged DeepVirFinder output (tab-separated)
dvf_path <- "../ToolOutput/testing_sets_deepvirfinder_output.tsv"
# Merged VirSorter output
vs_path <- "../ToolOutput/testing_sets_virsorter_output.tsv"
# Merged VirSorter2 output (tab-separated)
vs2_path <- "../ToolOutput/testing_sets_virsorter2_output.tsv"
# Merged Kaiju names output
kj_path <- "../ToolOutput/merged.nreuk.kaiju.names.out"
# Minimum contig length, compared against checkv_length. Despite the KB_ prefix
# the value is in base pairs (3000 bp), not kilobases.
KB_CUTOFF <- 3000
```

# All Viral Contigs

## checkV
```{r}
# Load the merged CheckV table, keeping only the columns used later.
checkV <- fread(
  checkV_path,
  sep = "\t",
  header = TRUE,
  select = c(
    "Index", "contig_id", "provirus", "completeness", "contamination",
    "viral_genes", "host_genes", "gene_count", "contig_length",
    "checkv_quality"
  )
)
# Prefix the CheckV-specific columns so they stay distinguishable after merging
checkV <- rename(
  checkV,
  contig = contig_id,
  checkv_provirus = provirus,
  checkv_completeness = completeness,
  checkv_contamination = contamination,
  checkv_viral_genes = viral_genes,
  checkv_host_genes = host_genes,
  checkv_total_genes = gene_count,
  checkv_length = contig_length
)
checkV$method <- "checkv"
# Headers look like "<seqtype>--<contig>"; split them into two columns
checkV <- separate(checkV, col = contig, into = c("seqtype", "contig"), sep = "--")
# Normalise the contig id: first "." and first "|" each become "_"
checkV$contig <- sub(".", "_", checkV$contig, fixed = TRUE)
checkV$contig <- sub("|", "_", checkV$contig, fixed = TRUE)
# Keep the first row per (contig, Index) pair
checkV <- checkV[!duplicated(paste(checkV$contig, checkV$Index)), ]
```

## VIBRANT
```{r}
# Load the merged VIBRANT quality table.
vb_c <- fread(
  vibrant_path,
  header = TRUE,
  sep = "\t",
  select = c("Index", "scaffold", "type", "Quality")
)
vb_c <- rename(vb_c, contig = scaffold, vibrant_quality = Quality)
vb_c$method <- "vibrant"
# Scaffold names containing "_fragment_" are flagged as prophage
vb_c$vibrant_prophage <- "No"
vb_c$vibrant_prophage[grepl("_fragment_", vb_c$contig)] <- "Yes"
# Headers look like "<seqtype>--<contig>"; split them into two columns
vb_c <- separate(vb_c, col = contig, into = c("seqtype", "contig"), sep = "--")
# Strip the fragment suffix, turn the first "." into "_", and keep only the
# text before the first space so ids line up with the other tools
vb_c$contig <- gsub("_fragment_.*", "", vb_c$contig)
vb_c$contig <- sub(".", "_", vb_c$contig, fixed = TRUE)
vb_c <- separate(vb_c, col = contig, into = "contig", remove = TRUE, sep = " ")
# Keep the first row per (contig, Index) pair
vb_c <- vb_c[!duplicated(paste(vb_c$contig, vb_c$Index)), ]
```

## DeepVirFinder
```{r}
# Load the merged DeepVirFinder predictions.
dvf_c <- fread(
  dvf_path,
  header = TRUE,
  sep = "\t",
  select = c("Index", "name", "score", "pvalue")
)
dvf_c <- rename(dvf_c, contig = name)
# First "." in the header becomes "_", then "<seqtype>--<contig>" is split
dvf_c$contig <- sub(".", "_", dvf_c$contig, fixed = TRUE)
dvf_c <- separate(dvf_c, col = contig, into = c("seqtype", "contig"), sep = "--")
# Benjamini-Hochberg adjustment over all rows read (done before de-duplication)
dvf_c$bh_pvalue <- p.adjust(dvf_c$pvalue, method = "BH")
# Keep the first row per (contig, Index) pair
dvf_c <- dvf_c[!duplicated(paste(dvf_c$contig, dvf_c$Index)), ]
```

## VirSorter
```{r}
# Load the merged VirSorter table; its index column is lower-case in the file.
vs_c <- fread(vs_path, select = c("index", "contig", "category"))
vs_c <- rename(vs_c, Index = index)
# Remove the "VIRSorter_" prefix and "-circular" suffix from contig names
vs_c$contig <- sub("VIRSorter_", "", vs_c$contig)
vs_c$contig <- sub("-circular", "", vs_c$contig)
# Headers look like "<seqtype>--<contig>"; split, then normalise the first "."
vs_c <- separate(vs_c, col = `contig`, into = c("seqtype", "contig"), sep = "--")
vs_c$contig <- sub(".", "_", vs_c$contig, fixed = TRUE)
# Discard rows left without a contig id, then keep the first row per
# (contig, Index) pair
vs_c <- drop_na(vs_c, contig)
vs_c <- vs_c[!duplicated(paste(vs_c$contig, vs_c$Index)), ]
```


## VirSorter2 
```{r}
# Load the merged VirSorter2 score table.
vs2_c <- fread(
  vs2_path,
  header = TRUE,
  sep = "\t",
  select = c(
    "Index", "seqname", "dsDNAphage", "ssDNA", "max_score",
    "max_score_group", "hallmark", "viral", "cellular"
  )
)
# seqname is "<contig>||<suffix>"; the suffix is kept in vs2type
vs2_c <- separate(
  vs2_c,
  col = seqname,
  into = c("contig", "vs2type"),
  sep = "\\|\\|",
  remove = TRUE
)
# First "." becomes "_", then "<seqtype>--<contig>" is split
vs2_c$contig <- sub(".", "_", vs2_c$contig, fixed = TRUE)
vs2_c <- separate(vs2_c, col = contig, into = c("seqtype", "contig"), sep = "--")
# Keep the first row per (contig, Index) pair
vs2_c <- vs2_c[!duplicated(paste(vs2_c$contig, vs2_c$Index)), ]
```


## Kaiju 
```{r}
# Load the merged Kaiju names output (all columns).
kj_c <- read_tsv(kj_path, col_names = TRUE)
# Headers look like "<seqtype>--<contig>"; split them into two columns
kj_c <- separate(kj_c, col = Contig, into = c("seqtype", "contig"), sep="--")
# Keep the first two ";"-separated fields of the Kaiju name
kj_c <- separate(kj_c, col = Name, into = c("Kaiju_Viral","Kingdom"), sep=";")
# Replace only the FIRST "." in the contig id, as every other tool chunk does.
# The dot used to be replaced both before and after the split, so ids with two
# dots were rewritten differently here and would not match in the merge.
kj_c$contig <- sub("\\.", "_", kj_c$contig)
# Keep the first row per (contig, Index) pair
kj_c <- kj_c[!duplicated(paste(kj_c$contig, kj_c$Index)),]
```
Quick glimpse of cellular vs viral sequence classification by Kaiju by assembly
```{r}
# Cross-tabulate sequence type against the first field of the Kaiju name
table(kj_c$seqtype, kj_c$Kaiju_Viral)
```


## Merging
```{r}
# Outer-join the per-tool tables one after another, starting from CheckV, so
# that every (Index, contig, seqtype) reported by any tool gets a row.
viruses <- Reduce(
  function(merged, tool_table) {
    full_join(x = merged, y = tool_table, by = c("Index", "contig", "seqtype"))
  },
  list(checkV, kj_c, dvf_c, vb_c, vs_c, vs2_c)
)
```

Use the next code chunk to check if any contigs are missing from checkv.
If yes, then there is a mix up in contig names since checkv should contain
all contig names even if they are identified as non-viral. This check has to
run before the length filter below, because that filter drops every row
that has no CheckV length.
```{r}
# Rows contributed only by other tools have no CheckV values after the full
# joins; checkv_length is used as the marker for a missing CheckV record.
v_missing <- viruses[is.na(viruses$checkv_length), ]
```

Remove contigs not greater than the basepair length cutoff (3000)
```{r}
viruses <- viruses %>% filter(checkv_length > KB_CUTOFF)
```

calculate percent viral/host/unknown
```{r}
# Share of CheckV-annotated genes that are host, viral, or neither, as a
# percentage of the CheckV total gene count (NaN/Inf where that count is 0).
viruses$percent_host <- viruses$checkv_host_genes/viruses$checkv_total_genes*100
viruses$percent_viral <- viruses$checkv_viral_genes/viruses$checkv_total_genes*100
viruses$percent_unknown <- 100-(viruses$checkv_host_genes+viruses$checkv_viral_genes)/viruses$checkv_total_genes*100
```

get rid of NAs for downstream processing
```{r}
# checkV: undefined percentages are treated as 0
# NOTE(review): percent_host is not filled in here although it is computed
# alongside these two; confirm that is intended.
viruses$percent_viral[is.na(viruses$percent_viral)] <- 0
viruses$percent_unknown[is.na(viruses$percent_unknown)] <- 0
# VIBRANT: sequences without a VIBRANT record get 0 in place of a quality label
viruses$vibrant_quality[is.na(viruses$vibrant_quality)] <- 0
# DeepVirFinder: sequences without a prediction get score 0
# NOTE(review): a missing adjusted p-value also becomes 0, which reads as
# maximally significant; confirm nothing downstream filters on bh_pvalue alone.
viruses$score[is.na(viruses$score)] <- 0
viruses$bh_pvalue[is.na(viruses$bh_pvalue)] <- 0
# VirSorter2: sequences without a record get 0 for viral and hallmark
viruses$viral[is.na(viruses$viral)] <- 0
viruses$hallmark[is.na(viruses$hallmark)] <- 0
# VirSorter: 0 stands for "no category", so none of the category rules match
viruses$category[is.na(viruses$category)] <- 0
# Kaiju: unclassified sequences are labelled "unknown"
viruses$Kaiju_Viral[is.na(viruses$Kaiju_Viral)] <- "unknown"
viruses$Kingdom[is.na(viruses$Kingdom)] <- "unknown"
```



This section defines a viralness score "keep_score" based on the tool classifications. 
A final keep_score of 1 or higher indicates we will keep that sequence and call it viral.

VIBRANT
    Quality == "High Quality Draft": +1
    Quality == "Medium Quality Draft": +1
    Quality == "Low Quality Draft" & provirus == TRUE: +0.5

Virsorter2
    Viral >= 50: +0.5
    Viral >= 95: +0.5

Virsorter
    category ==  1,2,4,5: +1
    category == 3,6: +0.5

DeepVirFinder (only applied to contigs shorter than 20,000 bp):
    Score >= 0.7: +0.5
    Score >= 0.9: +0.5

Kaiju:
    Kaiju_viral = "cellular organisms": -1
    Kaiju_viral = "Viruses": +1

CheckV
    If %unknown >= 75: +0.5
    If %viral >= 50: +0.5
    Hallmark > 2: +1
    viral_genes == 0 and host_genes >= 1: keep_score = 0
    If 3*viral_genes <= host_genes and CheckV does not call a provirus: keep_score = 0
    If length > 50,000 and hallmark == 0: keep_score = 0
    

This script produces visualizations of these combined viral scorings and
includes ecological metrics like alpha diversity.

You can decide which combination of tools is appropriate for your data and
only need to use those tools.

```{r getting_viral_set_1}
# Compute the "keep_score" viralness score for every row of `input_seqs`.
#
# Each include_* flag switches one tool's rules on:
#   include_vibrant        high/medium quality draft +1; low quality draft that
#                          is also flagged as prophage +0.5
#   include_virsorter2     viral >= 50 +0.5, viral >= 95 a further +0.5
#   include_virsorter      category 1, 2, 4, 5 +1; category 3, 6 +0.5
#   include_deepvirfinder  contigs shorter than 20000 only: score >= 0.7 +0.5,
#                          score >= 0.9 a further +0.5
#   include_kaiju          "cellular organisms" -1, "Viruses" +1
#   include_tuning         hallmark > 2 +1, percent_unknown >= 75 +0.5,
#                          percent_viral >= 50 +0.5, then the score is reset
#                          to 0 for rows that look cellular (see below)
#
# Returns a numeric vector with one score per row. Stops with an error when a
# column needed by an enabled rule is absent.
#
# The merged table names the CheckV columns checkv_viral_genes,
# checkv_host_genes and checkv_length, and the VIBRANT flag vibrant_prophage.
# Earlier code read viral_genes, host_genes, contig_length and provirus, which
# do not exist there, so those rules silently matched nothing. The old names
# are still accepted as a fallback.
# NOTE(review): the low-quality VIBRANT rule is taken to mean VIBRANT's own
# prophage flag (vibrant_prophage); confirm it was not meant to be
# checkv_provirus.
getting_viral_set_1 <- function(input_seqs,
                                include_vibrant=FALSE, 
                                include_virsorter2=FALSE,
                                include_deepvirfinder=FALSE,
                                include_tuning=FALSE,
                                include_kaiju=FALSE,
                                include_virsorter=FALSE) {

  # First of `candidates` that exists in input_seqs, as a vector
  column <- function(candidates) {
    present <- intersect(candidates, names(input_seqs))
    if (length(present) == 0) {
      stop("getting_viral_set_1: input_seqs has none of the columns: ",
           paste(candidates, collapse = ", "), call. = FALSE)
    }
    input_seqs[[present[1]]]
  }

  # TRUE where a rule holds; NA comparisons count as "rule not met"
  met <- function(rule) !is.na(rule) & rule

  keep_score <- rep(0, nrow(input_seqs))

  if (include_vibrant) {
    quality <- column("vibrant_quality")
    prophage <- column(c("vibrant_prophage", "provirus"))
    keep_score <- keep_score +
      1 * met(quality == "high quality draft") +
      1 * met(quality == "medium quality draft") +
      0.5 * met(quality == "low quality draft" & prophage == "Yes")
  }

  if (include_virsorter2) {
    viral <- column("viral")
    keep_score <- keep_score + 0.5 * met(viral >= 50) + 0.5 * met(viral >= 95)
  }

  if (include_virsorter) {
    category <- column("category")
    keep_score <- keep_score +
      1 * met(category %in% c(1, 2, 4, 5)) +
      0.5 * met(category %in% c(3, 6))
  }

  if (include_deepvirfinder) {
    dvf_score <- column("score")
    is_short <- column(c("checkv_length", "contig_length")) < 20000
    keep_score <- keep_score +
      0.5 * met(dvf_score >= 0.7 & is_short) +
      0.5 * met(dvf_score >= 0.9 & is_short)
  }

  if (include_kaiju) {
    kaiju_call <- column("Kaiju_Viral")
    keep_score <- keep_score -
      1 * met(kaiju_call == "cellular organisms") +
      1 * met(kaiju_call == "Viruses")
  }

  if (include_tuning) {
    hallmark <- column("hallmark")
    viral_genes <- column(c("checkv_viral_genes", "viral_genes"))
    host_genes <- column(c("checkv_host_genes", "host_genes"))
    contig_length <- column(c("checkv_length", "contig_length"))

    keep_score <- keep_score +
      1 * met(hallmark > 2) +
      0.5 * met(column("percent_unknown") >= 75) +
      0.5 * met(column("percent_viral") >= 50)

    # Reset rules run last so they override every tool's contribution:
    # host genes without any viral gene; host genes outnumbering viral genes
    # 3:1 outside a CheckV provirus; long contigs without a hallmark gene.
    looks_cellular <-
      met(viral_genes == 0 & host_genes >= 1) |
      met(viral_genes * 3 <= host_genes & column("checkv_provirus") == "No") |
      met(contig_length > 50000 & hallmark == 0)
    keep_score[looks_cellular] <- 0
  }

  keep_score
}
```

# Assessing performance against the "truth"
note that this is only as accurate as the annotations of the input sequences

this function calculates the precision, recall, and F1 score for each pipeline
```{r}
# Summarise how well a keep_score separates viral from non-viral sequences.
#
#   seqtype     labelled sequence type per sequence; "virus" marks the positives
#   keep_score  numeric score per sequence; >= 1 is a viral call
#
# Returns a named numeric vector: precision, recall, F1, MCC, AUC, fungal_FP
# (false positives labelled "fungi"), bacterial_FP (false positives labelled
# "bacteria") and viral_FN (missed viruses). Ratios are NaN when their
# denominator is 0.
#
# Counts are taken with sum() on logical masks instead of indexing a table()
# by position: a table drops absent classes, so position 2 or 4 pointed at the
# wrong class whenever one was missing, and inside the fungi/bacteria subsets
# position 2 was the true-negative count, not the false positives.
assess_performance <- function(seqtype, keep_score) {
  stopifnot(length(seqtype) == length(keep_score))

  truth <- rep("not viral", length(seqtype))
  truth[!is.na(seqtype) & seqtype == "virus"] <- "viral"

  is_viral <- truth == "viral"
  # A missing score counts as "not called viral"
  called_viral <- !is.na(keep_score) & keep_score >= 1

  # as.numeric keeps the MCC products from overflowing integer range
  TP <- as.numeric(sum(is_viral & called_viral))
  FP <- as.numeric(sum(!is_viral & called_viral))
  TN <- as.numeric(sum(!is_viral & !called_viral))
  FN <- as.numeric(sum(is_viral & !called_viral))

  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  F1 <- 2 * precision * recall / (precision + recall)

  MCC <- (TP * TN - FP * FN) /
    sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))

  # Classes and direction are fixed (a higher score means viral) so pROC does
  # not pick them per call and a worse-than-random score is not flipped.
  AUC <- round(as.numeric(auc(truth, keep_score,
                              levels = c("not viral", "viral"),
                              direction = "<",
                              quiet = TRUE)), 4)

  # Per-type error counts
  fungal_FP <- sum(!is.na(seqtype) & seqtype == "fungi" & called_viral)
  bacterial_FP <- sum(!is.na(seqtype) & seqtype == "bacteria" & called_viral)
  viral_FN <- FN

  c(precision = precision, recall = recall, F1 = F1, MCC = MCC, AUC = AUC,
    fungal_FP = fungal_FP, bacterial_FP = bacterial_FP, viral_FN = viral_FN)
}
```

combination of tools list
```{r}
# One row per tool combination; build_score_list reads its CheckV, DVF, Kaiju,
# VIBRANT, VS and VS2 columns as on/off flags.
combos_list <- read_csv("combinations_list.csv", col_names = T)
```

this function builds a list of all of the combinations that the user wants to 
test. 
In this case, we're comparing the performance of all unique combinations of the 
six tools.
```{r}
# Score `input_seqs` with every tool combination in `combos` and collect the
# performance metrics.
#
#   input_seqs  merged tool table; must contain seqtype plus the columns needed
#               by getting_viral_set_1 for the enabled tools
#   combos      one row per combination, with on/off columns CheckV, DVF,
#               Kaiju, VIBRANT, VS and VS2
#
# Returns a data.frame with one row per combination: the eight metrics from
# assess_performance (undefined values replaced by 0) and `toolcombo`, the six
# flags pasted in the order CheckV, DVF, Kaiju, VIBRANT, VS, VS2.
build_score_list <- function(input_seqs, combos) {
  metric_names <- c("precision", "recall", "F1", "MCC", "AUC",
                    "fungal_FP", "bacterial_FP", "viral_FN")
  n_combos <- nrow(combos)

  # Preallocate every column, including toolcombo, before the loop
  output <- as.data.frame(matrix(0, nrow = n_combos, ncol = length(metric_names),
                                 dimnames = list(NULL, metric_names)))
  output$toolcombo <- character(n_combos)

  # seq_len() keeps the loop from running when combos has no rows
  for (i in seq_len(n_combos)) {
    keep_score <- getting_viral_set_1(input_seqs,
                                      include_vibrant = combos$VIBRANT[i],
                                      include_virsorter = combos$VS[i],
                                      include_virsorter2 = combos$VS2[i],
                                      include_tuning = combos$CheckV[i],
                                      include_kaiju = combos$Kaiju[i],
                                      include_deepvirfinder = combos$DVF[i])

    output[i, seq_along(metric_names)] <-
      assess_performance(input_seqs$seqtype, keep_score)

    output$toolcombo[i] <- paste(combos$CheckV[i], combos$DVF[i],
                                 combos$Kaiju[i], combos$VIBRANT[i],
                                 combos$VS[i], combos$VS2[i])
  }

  # Metrics that are undefined (e.g. 0/0) are reported as 0
  output[is.na(output)] <- 0

  output
}
```

## Calculate the performance of each pipeline
```{r}
# Score every tool combination separately on each testing set and stack the
# results, tagging each row with the testing-set index it came from.
# The testing-set ids are read from the data instead of being fixed at 1:10,
# each set is scored exactly once, and the per-set tables are bound together in
# a single rbind call instead of growing a data frame inside a loop.
testing_set_ids <- sort(unique(viruses$Index))
accuracy_scores <- do.call(
  rbind,
  lapply(testing_set_ids, function(set_id) {
    in_set <- !is.na(viruses$Index) & viruses$Index == set_id
    cbind(testing_set_index = rep(set_id, nrow(combos_list)),
          build_score_list(viruses[in_set, ], combos_list))
  })
)
```

```{r}
library("stringr")
```

```{r}
# Number of tools in a combination = number of "1" characters in its label
accuracy_scores$numtools <- str_count(accuracy_scores$toolcombo, "1")
#accuracy_scores <- accuracy_scores[order(accuracy_scores$numtools, decreasing=F),]
# Sort by increasing MCC, then freeze that order as the factor levels so the
# plots show the combinations in this order
accuracy_scores <- accuracy_scores[order(accuracy_scores$MCC, decreasing=F),]
accuracy_scores$toolcombo <- factor(accuracy_scores$toolcombo, levels = unique(accuracy_scores$toolcombo))
# Treated as a category so it maps to discrete colours
accuracy_scores$numtools <- as.factor(accuracy_scores$numtools)
```


## Visualize how the precision, recall, and F1 scores change across pipelines.
```{r}
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")

# Scatter plot of one column (or expression) of the accuracy table against
# another, coloured by the number of tools in the combination.
#   x, y              unquoted columns or expressions evaluated in `scores`
#   x_label, y_label  axis titles
#   scores            table to plot; defaults to accuracy_scores
# Returns the ggplot object (auto-printed when called at top level).
plot_accuracy_metric <- function(x, y, x_label, y_label,
                                 scores = accuracy_scores) {
  ggplot(scores, aes(x = {{ x }}, y = {{ y }},
                     color = numtools, fill = numtools)) +
    geom_point(alpha = 0.5) +
    theme_light() +
    theme(
      panel.grid.major.y = element_blank(),
      panel.border = element_blank(),
      axis.ticks.y = element_blank(),
      legend.position = "bottom",
      axis.text.y = element_text(size = 14),
      axis.text.x = element_text(size = 14, angle = 90),
      legend.text = element_text(size = 12),
      axis.title = element_text(size = 16)
    ) +
    xlab(x_label) +
    ylab(y_label)
}

# Order of the flags inside each toolcombo label
combo_axis_label <- "Tool Combination (CV, DVF, KJ, VB, VS, VS2)"

p2 <- plot_accuracy_metric(toolcombo, F1, combo_axis_label, "F1 Score")
p2
plot_accuracy_metric(toolcombo, precision, combo_axis_label, "Precision")
plot_accuracy_metric(toolcombo, recall, combo_axis_label, "Recall")
plot_accuracy_metric(precision, recall, "Precision", "Recall")
plot_accuracy_metric(toolcombo, abs(precision-recall), combo_axis_label,
                     "Precision-Recall")
plot_accuracy_metric(toolcombo, MCC, combo_axis_label, "MCC")
plot_accuracy_metric(toolcombo, AUC, combo_axis_label, "AUC")
plot_accuracy_metric(toolcombo, fungal_FP, combo_axis_label,
                     "Fungal False Positives")
plot_accuracy_metric(toolcombo, bacterial_FP, combo_axis_label,
                     "Bacterial False Positives")
plot_accuracy_metric(toolcombo, viral_FN, combo_axis_label,
                     "Viral False Negatives")
```






# Experimenting

## high precision example
```{r}
# High-precision rule set: VIBRANT, Kaiju and the tuning rules are on;
# DeepVirFinder, VirSorter and VirSorter2 are off.
viruses$keep_score_high_precision <- getting_viral_set_1(
  viruses,
  include_vibrant = TRUE,
  include_virsorter2 = FALSE,
  include_deepvirfinder = FALSE,
  include_tuning = TRUE,
  include_kaiju = TRUE,
  include_virsorter = FALSE
)
```


```{r}
# A sequence is called viral when its high-precision score is at least 1.
# Rows where either comparison is NA stay "true negative".
labelled_virus <- viruses$seqtype == "virus"
called_viral <- viruses$keep_score_high_precision >= 1
precision_calls <- rep("true negative", nrow(viruses))
precision_calls[which(labelled_virus & !called_viral)] <- "false negative"
precision_calls[which(labelled_virus & called_viral)] <- "true positive"
precision_calls[which(!labelled_virus & called_viral)] <- "false positive"
viruses$confusion_matrix_high_precision <- precision_calls
```

visualizing confusion matrix by taxa
```{r}
# Count sequences per confusion-matrix class, sequence type and testing-set
# index. reshape2::melt is named explicitly: with data.table attached, a bare
# melt() on a table object goes through a deprecated redirect that data.table
# announces will become an error.
confusion_by_taxa <- reshape2::melt(
  table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index)
)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
```



```{r}
# Tableau 10 colour generator; the plots below call pal(4) for four colours
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
```

```{r}
# Stacked bars of sequence counts per testing-set index, one facet per
# sequence type, coloured by confusion-matrix class.
ggplot(
  confusion_by_taxa,
  aes(x = count, y = as.factor(Index),
      fill = confusion_matrix, color = confusion_matrix)
) +
  geom_col() +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y = element_text(size = 14),
    axis.text.x = element_text(size = 14),
    legend.text = element_text(size = 12),
    axis.title = element_text(size = 16)
  ) +
  scale_fill_manual(
    name = "",
    values = alpha(rev(pal(4)), 0.5),
    labels = c("false negative", "false positive",
               "true negative", "true positive")
  ) +
  scale_color_manual(
    name = "",
    values = alpha(rev(pal(4)), 1),
    labels = c("false negative", "false positive",
               "true negative", "true positive")
  ) +
  labs(x = "Number of Sequences", y = "") +
  facet_wrap(~seqtype, scales = "free") +
  coord_flip()
```

```{r}
 # Distribution of the CheckV viral gene count within each confusion-matrix
 # class, one facet per sequence type. The axis title now names what is
 # plotted (viral genes per sequence), not "viral sequences".
 ggplot(viruses, aes(x=checkv_viral_genes, y=confusion_matrix_high_precision,
                   fill=confusion_matrix_high_precision,
                   color=confusion_matrix_high_precision)) +
  geom_boxplot(alpha=0.3) +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y=element_text(size=14),
    axis.text.x=element_text(size=14),
    legend.text=element_text(size=12),
    axis.title=element_text(size=16)
  ) +
  scale_fill_manual(name="",
                     values = alpha(rev(pal(4)), 0.5),
                    labels=c("false negative", "false positive", 
                             "true negative", "true positive")) +
  scale_color_manual(name="",
                     values = alpha(rev(pal(4)), 1),
                    labels=c("false negative", "false positive", 
                             "true negative", "true positive")) +
  xlab("Number of Viral Genes") +
  ylab("") + 
  facet_wrap(~seqtype, scales = "free") +
  coord_flip()

 ggplot(viruses, aes(x=percent_viral, y=confusion_matrix_high_precision,
                   fill=confusion_matrix_high_precision,
                   color=confusion_matrix_high_precision)) +
  geom_boxplot(alpha=0.3) +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y=element_text(size=14),
    axis.text.x=element_text(size=14),
    legend.text=element_text(size=12),
    axis.title=element_text(size=16),
  ) +
  scale_fill_manual(name="",
                     values = alpha(rev(pal(4)), 0.5),
                    labels=c("false negative", "false positive", 
                             "true negative", "true positive")) +
  scale_color_manual(name="",
                     values = alpha(rev(pal(4)), 1),
                    labels=c("false negative", "false positive", 
                             "true negative", "true positive")) +
  xlab("Percent Genes Viral") +
  ylab("") + 
  facet_wrap(~seqtype, scales = "free") +
  coord_flip()

 ggplot(viruses, aes(x=hallmark, y=confusion_matrix_high_precision,
                   fill=confusion_matrix_high_precision,
                   color=confusion_matrix_high_precision)) +
  geom_boxplot(alpha=0.3) +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y=element_text(size=14),
    axis.text.x=element_text(size=14),
    legend.text=element_text(size=12),
    axis.title=element_text(size=16),
  ) +
  scale_fill_manual(name="",
                     values = alpha(rev(pal(4)), 0.5),
                    labels=c("false negative", "false positive", 
                             "true negative", "true positive")) +
  scale_color_manual(name="",
                     values = alpha(rev(pal(4)), 1),
                    labels=c("false negative", "false positive", 
                             "true negative", "true positive")) +
  xlab("Number of Hallmark Genes") +
  ylab("") + 
  facet_wrap(~seqtype, scales = "free") +
  coord_flip()
 
ggplot(viruses, aes(x=hallmark, y=checkv_viral_genes,
                   fill=confusion_matrix_high_precision,
                   color=confusion_matrix_high_precision)) +
  geom_point(alpha=0.3) +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y=element_text(size=14),
    axis.text.x=element_text(size=14),
    legend.text=element_text(size=12),
    axis.title=element_text(size=16),
  ) +
  scale_fill_manual(name="",
                     values = alpha(rev(pal(4)), 0.5),
                    labels=c("false negative", "false positive", 
                             "true negative", "true positive")) +
  scale_color_manual(name="",
                     values = alpha(rev(pal(4)), 1),
                    labels=c("false negative", "false positive", 
                             "true negative", "true positive")) +
  xlab("Number of Hallmark Genes") +
  ylab("Number of Viral Genes") + 
  facet_wrap(~seqtype, scales = "free") +
  coord_flip()
```

```{r}
# Row subsets of `viruses` for the two misclassified groups under the
# high-precision rule; used by the scatter plots that follow.
viruses_false_positive <-
  viruses[viruses[["confusion_matrix_high_precision"]] == "false positive", ]
viruses_false_negative <-
  viruses[viruses[["confusion_matrix_high_precision"]] == "false negative", ]
```

```{r}
# Shared layers for the scatter plots in this chunk: translucent points, the
# common theme, the hallmark-gene axis title and flipped axes.
hallmark_scatter_layers <- list(
  geom_point(alpha = 0.3),
  theme_light(),
  # No trailing comma after the last element: an empty argument is rejected
  # by ggplot2 versions that collect theme elements with list().
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y = element_text(size = 14),
    axis.text.x = element_text(size = 14),
    legend.text = element_text(size = 12),
    axis.title = element_text(size = 16)
  ),
  xlab("Number of Hallmark Genes"),
  coord_flip()
)

# All sequences: hallmark genes vs CheckV viral genes, coloured by length.
ggplot(viruses, aes(x = hallmark, y = checkv_viral_genes,
                    fill = checkv_length,
                    color = checkv_length,
                    shape = checkv_provirus)) +
  hallmark_scatter_layers +
  ylab("Number of Viral Genes") +
  facet_wrap(~seqtype, scales = "free")

# False positives: hallmark genes vs contig length, one panel per seqtype.
ggplot(viruses_false_positive, aes(x = hallmark, y = checkv_length,
                                   fill = checkv_viral_genes,
                                   color = checkv_viral_genes,
                                   shape = checkv_provirus)) +
  hallmark_scatter_layers +
  ylab("Contig Length") +
  facet_wrap(~seqtype, scales = "free")

# The three per-seqtype subsets below use `[mask, ]`. The trailing comma makes
# the mask a row filter for a plain data.frame as well as for a data.table;
# without it a data.frame treats the mask as a column selector.

# Bacterial false positives, one panel per Kaiju_Viral value.
ggplot(viruses_false_positive[viruses_false_positive$seqtype == "bacteria", ],
       aes(x = hallmark, y = checkv_length,
           fill = checkv_viral_genes,
           color = checkv_viral_genes,
           shape = checkv_provirus)) +
  hallmark_scatter_layers +
  ylab("Contig Length") +
  facet_wrap(~Kaiju_Viral, scales = "free")

# Fungal false positives, coloured by the high-precision keep score.
ggplot(viruses_false_positive[viruses_false_positive$seqtype == "fungi", ],
       aes(x = hallmark, y = checkv_length,
           fill = keep_score_high_precision,
           color = keep_score_high_precision,
           shape = checkv_provirus)) +
  hallmark_scatter_layers +
  ylab("Contig Length") +
  facet_wrap(~Kaiju_Viral, scales = "free")

# Protist false positives, one panel per Kaiju_Viral value.
ggplot(viruses_false_positive[viruses_false_positive$seqtype == "protist", ],
       aes(x = hallmark, y = checkv_length,
           fill = checkv_viral_genes,
           color = checkv_viral_genes,
           shape = checkv_provirus)) +
  hallmark_scatter_layers +
  ylab("Contig Length") +
  facet_wrap(~Kaiju_Viral, scales = "free")

# False negatives, coloured by CheckV viral gene count.
ggplot(viruses_false_negative, aes(x = hallmark, y = checkv_length,
                                   fill = checkv_viral_genes,
                                   color = checkv_viral_genes,
                                   shape = checkv_provirus)) +
  hallmark_scatter_layers +
  ylab("Contig Length") +
  facet_wrap(~Kaiju_Viral, scales = "free")

# False negatives, coloured by the high-precision keep score.
ggplot(viruses_false_negative, aes(x = hallmark, y = checkv_length,
                                   fill = keep_score_high_precision,
                                   color = keep_score_high_precision,
                                   shape = checkv_provirus)) +
  hallmark_scatter_layers +
  ylab("Contig Length") +
  facet_wrap(~Kaiju_Viral, scales = "free")
```




```{r}
# Among high-precision false positives: how many have at least one hallmark
# gene, and how many have percent_host below 50.
with(
  viruses,
  table(hallmark[confusion_matrix_high_precision == "false positive"] > 0)
)

with(
  viruses,
  table(percent_host[confusion_matrix_high_precision == "false positive"] < 50)
)
```



## high recall example
```{r}
# Score every sequence with getting_viral_set_1 using the tool combination
# chosen for the high-recall example: VIBRANT, Kaiju, VirSorter and the tuning
# rules switched on; DeepVirFinder and VirSorter2 switched off.
# Flags are spelled TRUE/FALSE because T and F are ordinary variables that can
# be reassigned.
viruses$keep_score_high_recall <- getting_viral_set_1(
  viruses,
  include_deepvirfinder = FALSE,
  include_vibrant = TRUE,
  include_virsorter2 = FALSE,
  include_kaiju = TRUE,
  include_tuning = TRUE,
  include_virsorter = TRUE
)
```


```{r}
# Confusion-matrix label per sequence for the high-recall rule: seqtype
# "virus" is the positive class and a keep score of at least 1 is a viral call.
# Everything starts as "true negative"; the three assignments below cover the
# remaining cases.
# The two virus conditions are mutually exclusive (< 1 vs >= 1), so a virus
# scoring exactly 1 is a true positive regardless of statement order.
viruses$confusion_matrix <- "true negative"
viruses$confusion_matrix[viruses$seqtype == "virus" & viruses$keep_score_high_recall < 1] <- "false negative"
viruses$confusion_matrix[viruses$seqtype == "virus" & viruses$keep_score_high_recall >= 1] <- "true positive"
viruses$confusion_matrix[viruses$seqtype != "virus" & viruses$keep_score_high_recall >= 1] <- "false positive"
```

Visualizing the high-recall confusion matrix by taxa:
```{r}
# Count sequences per (confusion-matrix class, seqtype, Index) for the
# high-recall labels and reshape the three-way table to long format.
# reshape2::melt is named explicitly: data.table is attached after reshape2,
# so a bare melt() resolves to data.table's generic, which handles a `table`
# only by redirecting to reshape2 (with a deprecation warning).
confusion_by_taxa <- reshape2::melt(
  table(viruses$confusion_matrix, viruses$seqtype, viruses$Index)
)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype", "Index", "count")
```



```{r}
# Palette generator: pal(n) returns the first n "Tableau 10" colours.
pal <- ggthemes::tableau_color_pal("Tableau 10", type = "regular")
```

```{r}
# Stacked bars of sequence counts per Index for the high-recall labels, split
# by confusion-matrix class, one panel per seqtype.
# Colours are keyed by class name instead of by position, so a class that is
# absent from the data cannot shift the colours or clash with a fixed-length
# `labels` vector. Legend text is unchanged (the class names themselves).
confusion_classes <- c("false negative", "false positive",
                       "true negative", "true positive")
p2 <- ggplot(confusion_by_taxa, aes(x = count, y = as.factor(Index),
                                    fill = confusion_matrix,
                                    color = confusion_matrix)) +
  geom_bar(stat = "identity") +
  theme_light() +
  # No trailing comma after the last element: an empty argument is rejected
  # by ggplot2 versions that collect theme elements with list().
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom",
    axis.text.y = element_text(size = 14),
    axis.text.x = element_text(size = 14),
    legend.text = element_text(size = 12),
    axis.title = element_text(size = 16)
  ) +
  # Names are attached after alpha() so they survive the colour conversion.
  scale_fill_manual(name = "",
                    values = setNames(alpha(rev(pal(4)), 0.5),
                                      confusion_classes)) +
  scale_color_manual(name = "",
                     values = setNames(alpha(rev(pal(4)), 1),
                                       confusion_classes)) +
  xlab("Number of Sequences") +
  ylab("") +
  facet_wrap(~seqtype, scales = "free") +
  coord_flip()
p2
```


# Visualizing confusion matrix by number of tools


```{r}
# Bin keep_score into four display categories: "≤ 0", "0.5", "1" and "> 1".
# Scores above 1 are first collapsed to the string ">1", which also turns the
# remaining numeric scores into their character form.
viruses$keep_score_visualize <- viruses$keep_score
viruses$keep_score_visualize[viruses$keep_score > 1] <- ">1"
# Levels and labels are set in ONE factor() call. Relabelling in a second
# factor(f, labels = ...) call drops unused levels first, so the six labels
# fail to match whenever one of the six score values is absent from the data.
# Repeating a label merges those levels into a single category.
# Any score not listed in `levels` becomes NA.
viruses$keep_score_visualize <- factor(
  viruses$keep_score_visualize,
  levels = c("-0.5", "-1", "0", "0.5", "1", ">1"),
  labels = c("≤ 0", "≤ 0", "≤ 0", "0.5", "1", "> 1")
)
```

```{r}
# Sanity check: list the score categories that actually occur in the data.
levels(factor(viruses[["keep_score_visualize"]]))
```


```{r}
# Sequence counts per Index, stacked by binned viral score, one panel per
# confusion-matrix class. `pal` is switched to the "Tableau 20" palette here.
pal <- ggthemes::tableau_color_pal("Tableau 20", type = "regular")
p1 <- ggplot(
  viruses,
  aes(x = as.factor(Index),
      fill = keep_score_visualize,
      color = keep_score_visualize)
) +
  geom_bar() +  # defaults: stat = "count", position = "stack"
  facet_wrap(~confusion_matrix, scales = "free") +
  scale_fill_manual(name = 'Viral Score', values = alpha(pal(4), 0.5)) +
  scale_color_manual(name = 'Viral Score', values = alpha(pal(4), 1)) +
  labs(x = "Index", y = "Sequence Count") +
  theme_light() +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 12),
    axis.title = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.ticks.y = element_blank(),
    panel.border = element_blank(),
    panel.grid.major.y = element_blank()
  ) +
  coord_flip()
p1
```


# ROC 
```{r}
library(pROC)
```

```{r}
# Ground-truth label for the ROC analysis: 1 for seqtype "virus", 0 otherwise.
# %in% is used so that a missing seqtype yields 0 rather than NA.
viruses$truepositive <- as.numeric(viruses$seqtype %in% "virus")
```


```{r}
# ROC curves of keep_score and keep_score_all against the ground-truth label.
rocobj <- roc(viruses$truepositive, viruses$keep_score)
rocobj_all <- roc(viruses$truepositive, viruses$keep_score_all)
# Read each AUC from the fitted ROC object rather than refitting from the raw
# vectors, so the number in the title always belongs to the curve drawn.
AUC <- round(auc(rocobj), 4)
auc_all <- round(auc(rocobj_all), 4)
#create ROC plot
ggroc(rocobj, colour = 'steelblue', size = 2) +
  ggtitle(paste0('ROC Curve ', '(AUC = ', AUC, ')')) +
  coord_equal()
# Same square aspect as the first curve so the two plots can be compared.
ggroc(rocobj_all, colour = 'green', size = 2) +
  ggtitle(paste0('ROC Curve ', '(AUC = ', auc_all, ')')) +
  coord_equal()
```
Sensitivity: The probability that the model predicts a positive outcome for an observation when indeed the outcome is positive.
Specificity: The probability that the model predicts a negative outcome for an observation when indeed the outcome is negative.

## Pull out Fungi

```{r}
# Fungal sequences whose Kaiju_Viral value is "Viruses".
# The trailing comma makes the mask a row filter for a plain data.frame as
# well as for a data.table; without it a data.frame treats the mask as a
# column selector.
fungi_1 <- viruses[(viruses$seqtype == "fungi") & (viruses$Kaiju_Viral == "Viruses"), ]
# NOTE(review): machine-specific absolute path; point it at a local copy of
# the VIBRANT annotation table before running.
vb_1 <- read_tsv("/Users/riddellj/Documents/Research/VSTE/Troubleshoot_Fungi/VIBRANT_annotations_metagenomic_testing_set_1.tsv")
# Split "scaffold" on "--" into seqtype and contig, then replace the first "."
# in each contig name with "_" (presumably to match contig names elsewhere).
vb_1 <- separate(vb_1, col = "scaffold", into = c("seqtype", "contig"), sep = "--")
vb_1$contig <- sub("\\.", "_", vb_1$contig)
# dplyr joins take `by`, not `on`. The former `on = 'contig'` never selected a
# key (it fell into `...`), so these were always natural joins on every shared
# column; the stray argument is dropped and that behaviour is kept.
# NOTE(review): if contig alone should be the key, pass by = "contig" after
# checking which other columns the tables share (e.g. seqtype).
vb_1_vb_c <- inner_join(vb_1, vb_c)
vb_1_vb_c_fungi <- vb_1_vb_c %>% filter(seqtype == "fungi")

fungi_1_vb_1_vb_c_fungi <- inner_join(fungi_1, vb_1_vb_c_fungi)

                                      
#Examples of "false positive" fungi
# https://www.ncbi.nlm.nih.gov/nuccore/599124765

```